home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
CICA 1993 April
/
CICA MS Windows - April 1993.iso
/
unzipped
/
programr
/
listings
/
ptv2n5
/
asmcoun1.pas
next >
Wrap
Pascal/Delphi Source File
|
1991-09-18
|
23KB
|
619 lines
{ ==========================================================================
Title: AsmCount
Programmer: David Neal Dubois
Description: Unit AsmCount provides objects for word counting.
Three objects are provided: CountObj, which counts
words in an ASCII file; SprintCountObj, which can count
words in a file created by Borland's Sprint word
processor; and AmiCountObj, designed to count words
in AmiPro files.
A word is defined as a sequence of non-separators. The
non-separators are the letters, digits, and apostrophe.
The counters for Sprint and AmiPro parse for control code
sequences. A control code is treated as a separator.
The formats in which Sprint and AmiPro store control codes
were determined emperically by David Gerrold. Sprint
brackets control codes by ASCII codes 15 and 14, (Ctrl-O and
Ctrl-N). Control codes may be nested. AmiPro indicates the
beginning of a control code by one of these two character
sequences: '<@', '<+', or '<-'. The code is terminated by
a '>'.
To use the counters, first initialize a *CountObj using the
Init constructor passing the file name as a parameter.
Then call the Count method. When the count method returns
the field WordCount will hold the number of words in the
file. Example:
var
Counter : CountObj;
begin
with Counter do
begin
Init ( 'Test' );
Count;
writeln ( 'number of words: ', WordCount );
Done;
end;
end.
The Count methods are written in assembly language for
maximum speed. Counting the words in a one megabyte ASCII
text file stored on RAM disk on a 386, 25 MHz machine takes
about 1.5 seconds.
Method: Count reads from the file in blocks of 65520 characters at
a time. As each character is processed, Count looks checks
a look-up table to determine whether the character is a
separator. When a non-separator is found followed by a
separator, a word is counted.
========================================================================== }
unit AsmCoun1;
interface
{ -----------------------------------------------------------------------
BlockSize - The maximum number of characters to be read from a file
at one time. This is set to 65520, the largest block
which can be allocated from the Turbo Pascal heap.
Init/Done - The Init constructor opens the file. The Done destructor
closes it.
Count - This method performs the actual word count.
InitCount - Used internally by Count. This method initializes the
look-up table, ensures the file is at its beginning
(in case Count was called twice), and allocates a block
of memory from the heap to act as a file buffer.
FiniCount - Used internally by Count. Disposes of the file buffer.
ReadBlock - Used by Count to read a block from the file. At this time
the ShowProgress method is called.
ShowProgress - Gives the user some indication of the Count method's
progress. Here it simply prints a period as each block
is read. This could changed to produce a percentage-bar.
----------------------------------------------------------------------- }
const
BlockSize = 65520;
type
BlockType = array [ 1 .. BlockSize ] of char;
BlockPtr = ^ BlockType;
CountPtr = ^ CountObj;
CountObj = object
TheFile : file;
WordCount : longint;
constructor Init ( FileName : string );
destructor Done;
procedure Count; virtual;
procedure InitCount; virtual;
procedure FiniCount; virtual;
procedure ReadBlock;
procedure ShowProgress;
end;
SprintCountPtr = ^ SprintCountObj;
SprintCountObj = object ( CountObj )
procedure Count; virtual;
procedure InitCount; virtual;
end;
AmiCountPtr = ^ SprintCountObj;
AmiCountObj = object ( CountObj )
procedure Count; virtual;
end;
implementation
{ -----------------------------------------------------------------------
These global variables are used internally by the Count method. Storing
these in the global data segment simplifies the assembly code which
accesses them.
Block - A pointer to the file buffer allocated by InitCount.
Table - The look-up table used to determine whether a character
is a separator.
Actual - When a block is read from a file, the actual number of bytes
which were read.
SaveBP - SprintObj.Count uses the BP register internally. It stores
the original value here for safe keeping.
----------------------------------------------------------------------- }
var
Block : BlockPtr;
Table : array [ char ] of byte;
Actual : word;
SaveBP : word;
{ -----------------------------------------------------------------------
ShowProgress - Let the user know that progress is being made.
----------------------------------------------------------------------- }
procedure CountObj . ShowProgress;
begin
{ write ( '.' ); } {Can't use this within a DLL!!! }
end;
{ -----------------------------------------------------------------------
Init - Open a binary file with a record size of 1 byte.
----------------------------------------------------------------------- }
constructor CountObj . Init ( FileName : string );
begin
assign ( TheFile, FileName );
reset ( TheFile, 1 );
WordCount := 0;
end;
{ -----------------------------------------------------------------------
Done - Close the file.
----------------------------------------------------------------------- }
destructor CountObj . Done;
begin
close ( TheFile );
end;
{ -----------------------------------------------------------------------
InitCount - performs three tasks.
[1] Ensure the file pointer is set to the beginning of the
file.
[2] Allocate a file buffer from the heap.
[3] Generate the look-up table used to determine whether
a character is a separator. A separator is indicated
by a one stored in the table, while a non-separator
is indicated by a zero.
----------------------------------------------------------------------- }
procedure CountObj . InitCount;
var
C : char;
begin
seek ( TheFile, 0 );
new ( Block );
for C := #0 to #255 do
if C in [ '''', '0' .. '9', 'A' .. 'Z', 'a' .. 'z' ] then
Table [ C ] := 0
else
Table [ C ] := 1;
end;
{ -----------------------------------------------------------------------
FiniCount - Dispose of the file buffer.
----------------------------------------------------------------------- }
procedure CountObj . FiniCount;
begin
dispose ( Block );
end;
{ -----------------------------------------------------------------------
ReadBlock - Read a block from the file into the buffer. Call
ShowProgress so the user knows what's happening.
----------------------------------------------------------------------- }
procedure CountObj . ReadBlock;
begin
ShowProgress;
blockread ( TheFile, Block ^, BlockSize, Actual );
end;
{ -----------------------------------------------------------------------
Count - Count the words in an ASCII text file.
Register usage: AL - indicates whether the current character is a
separator. One for separator, zero for
non-separator.
AH - indicates whether the previous character was a
separator.
BX - points to look-up table.
CX - number of characters left to be processed in
buffer.
DX:DI - 32-bit word count
ES:SI - points to next character to be processed.
----------------------------------------------------------------------- }
procedure CountObj . Count;
var
TempCount : longint;
begin
InitCount;
asm
cld { Clear direction flag. }
xor DX, DX { Set word count to zero. }
mov DI, DX
mov AX, 0101h { Treat beginning of file as separators. }
call @CallReadBlock { Fill file buffer. }
@ProcessNormalChar:
mov AH, AL { Set last character separator flag. }
seges lodsb { Read a character from the buffer. }
xlat { Check look-up table. }
cmp AX, 0001h { AX is 1 if this character is a separator }
{ and the previous character isn't. }
jz @CountWord { If this is the case, count a word. }
loop @ProcessNormalChar { Process the next character. }
call @CallReadBlock { If we've processed the whole buffer then }
jmp @ProcessNormalChar { fill it again. }
@CountWord:
add DI, 1 { Increment the word counter. }
adc DX, 0
loop @ProcessNormalChar { Process the next character. }
call @CallReadBlock
jmp @ProcessNormalChar
@CallReadBlock:
pushf { Before calling the ReadBlock method, }
push AX { save current state of registers. }
push DX
push DI
end; { asm }
ReadBlock; { Read a block of characters into buffer. }
asm
pop DI { Restore registers. }
pop DX
pop AX
popf
mov BX, offset Table { Set BX to point to look-up table. }
les SI, [ Block ] { Load address of file buffer into ES:SI. }
mov CX, [ Actual ] { Find out number of characters in buffer. }
jcxz @EndOfFile { If there are no characters in buffer, }
{ then the end of file has been reached. }
retn { Return from CallReadBlock. }
@EndOfFile:
add SP, 2 { Pop and ignore CallReadBlock's return }
{ address. }
cmp AL, 0 { Special consideration must be given to }
jnz @Fini { the end-of-file. If the last character }
add DI, 1 { processed was not a separator, count }
adc DX, 0 { a word. }
@Fini:
mov [ word ptr TempCount ], DI { Store the result in }
mov [ word ptr TempCount + 2 ], DX { TempCount. }
end; { asm }
FiniCount; { Dispose of file buffer. }
WordCount := TempCount; { Return word count. }
end;
{ -----------------------------------------------------------------------
The Sprint counter is similar to the ASCII counter, except that it
parses for Sprint control codes. Sprint brackets control codes by
Ctrl-O and Ctrl-N. These control codes may be nested. Processing
proceeds identically to that of the ASCII counter, until a Ctrl-O is
found.
When a control code sequence is found, the Sprint counter enters
another loop. The loop processes characters, keeping count of the
depth of control codes, adding one each time a ^O is encountered,
and subtracting one each time a ^N is found. When the count reaches
zero, the control code is terminated and control returns to normal
word-counting loop.
----------------------------------------------------------------------- }
{ -----------------------------------------------------------------------
InitCount - The Sprint counter's InitCount performs the same functions
as the ASCII InitCount, except that a special value is
placed in the look-up table for Ctrl-O to indicate the
start of control-code processing.
----------------------------------------------------------------------- }
procedure SprintCountObj . InitCount;
begin
CountObj . InitCount;
Table [ ^O ] := $FF;
end;
{ -----------------------------------------------------------------------
Count - Count the words in a Sprint file. Comments annotate differences
between this and CountObj.Count.
There are two pieces of code in this procedure which are kind of
tricky. These were designed to speed and simplify processing, but
they'll require some explanation.
------------------------------------------------------------------
Previous char Current char AH AL Need to do
------------------------------------------------------------------
separator separator 01 01 nothing
separator non-separator 01 00 nothing
separator Ctrl-O 01 FF process control code
non-separator separator 00 01 count a word
non-separator non-separator 00 00 nothing
non-separator Ctrl-O 00 FF process control code
------------------------------------------------------------------
This table shows how the main loop must handle different
situations that will occur. The focus of this code is to maximum
speed. Therefore, the codes for the table were carefully chosen so
that the more common "do nothing" situations can be distinguighed
from the others with a single comparison. Once it has been
determined that something must be done, it can be determined without
another comparison.
cmp AL, AH
ja @CountOrControlCode
<...do-nothing processing...>
@CountOrControlCode:
js <...control-code processing...>
<...count a word...>
The other bit of tricky code is in the control code sequence processing
loop. It is necessary to add one to a counter if character number 15
is found, and to subtract one if character 14 is found. Then, the
loop must be terminated if the count has reached zero. This task is
performed by the following code. (BP holds the depth counter.):
sub AL, 14
sub AL, 1
adc BP, 0
sub AL, 1
sbb BP, 0
jz <...loop complete...>
----------------------------------------------------------------------- }
procedure SprintCountObj . Count;
var
TempCount : longint;
begin
InitCount;
asm
cld
mov [ SaveBP ], BP { When it comes time to process a control }
{ code sequence, the depth counter is }
{ stored in BP. The original value of BP }
{ must be restored later when the method }
{ is complete, and also before ReadBlock }
{ is called. }
xor DX, DX
mov DI, DX
mov AX, 0101h
call @CallReadBlock
@ProcessNormalChar:
mov AH, AL
seges lodsb
xlat
@Continue:
cmp AL, AH { See comments above. }
ja @CountOrControlCode
loop @ProcessNormalChar
call @CallReadBlock
jmp @ProcessNormalChar
@CountOrControlCode:
js @ControlCode
add DI, 1 { Count a word. }
adc DX, 0
loop @ProcessNormalChar
call @CallReadBlock
jmp @ProcessNormalChar
@ControlCode:
mov BP, 1 { Process a control code sequence. A }
jmp @EndControlCodeLoop { control sequence begins with ^O and }
{ ends with ^N. Control codes may be }
{ nested. Therefore it is necessary to }
{ keep a depth counter. As ^O's are }
{ processed, the counter will be }
{ incremented. ^N's will decrement the }
{ counter. When the counter reaches }
{ zero, the control code is complete. }
{ The depth counter will be stored in }
{ BP. Since a ^O has already been }
{ found, the counter is set to 1. }
@ProcessControlCodeChar:
seges lodsb { Get next character. }
sub AL, 14 { See comments above. }
sub AL, 1
adc BP, 0
sub AL, 1
sbb BP, 0
jz @ControlCodeDone
@EndControlCodeLoop:
loop @ProcessControlCodeChar { Process next char. }
mov AL, AH { This line is necessary to ensure }
{ correct processing should the }
{ end-of-file already be reached. }
call @CallReadBlock
jmp @ProcessControlCodeChar
@ControlCodeDone:
mov AL, 1 { Return to normal processing as if a }
jmp @Continue { a separator had been found. }
@CallReadBlock:
pushf
push AX
push DX
push DI
push BP { Save the depth counter, and restore BP }
mov BP, [ SaveBP ] { will ReadBlock is called. }
end; { asm }
ReadBlock;
asm
pop BP
pop DI
pop DX
pop AX
popf
mov BX, offset Table
les SI, [ Block ]
mov CX, [ Actual ]
jcxz @EndOfFile
retn
@EndOfFile:
pop CX
cmp AL, 1
jz @Fini
add DI, 1
adc DX, 0
@Fini:
mov BP, [ SaveBP ] { Restore original BP. }
mov [ word ptr TempCount ], DI
mov [ word ptr TempCount + 2 ], DX
end; { asm }
FiniCount;
WordCount := TempCount;
end;
{ -----------------------------------------------------------------------
AmiPro control code sequences begin with '<', followed by one of '@',
'+' or '-'. The sequence terminates with a '>'. AmiCountObj.Count
processes characters the same as CountObj.Count except that it checks
each character to see if it is a '<'. When a '<' is encountered,
another character is read and checked against '@', '+' and '-'. If
one of these is found, Count enters a loop looking for '>'.
----------------------------------------------------------------------- }
procedure AmiCountObj . Count;
var
TempCount : longint;
begin
InitCount;
asm
cld
xor DX, DX
mov DI, DX
mov AX, 0101h
call @CallReadBlock
@ProcessNormalChar:
mov AH, AL
seges lodsb
cmp AL, '<' { Check to see if character is '<'. If }
jz @PerhapsControlCode { it is, a control code sequence may be }
{ beginning. }
@Continue:
xlat
cmp AX, 0001h
jz @CountWord
loop @ProcessNormalChar
call @CallReadBlock
jmp @ProcessNormalChar
@CountWord:
add DI, 1
adc DX, 0
loop @ProcessNormalChar
call @CallReadBlock
jmp @ProcessNormalChar
@ProcessCharAfterLT:
seges lodsb { Read the character that follows '<'. }
cmp AL, '@' { Check to see if it is '@', '+' or '-'. }
jz @ProcessControlCode
cmp AL, '+'
jz @ProcessControlCode
cmp AL, '-'
jz @ProcessControlCode
jmp @Continue
@PerhapsControlCode:
loop @ProcessCharAfterLT
mov AL, AH
call @CallReadBlock
jmp @ProcessCharAfterLT
@ProcessControlCodeChar:
seges lodsb { A control code sequence has begun. }
cmp AL, '>' { Process characters until a '>' is }
jz @Continue { found. }
@ProcessControlCode:
loop @ProcessControlCodeChar
mov AL, AH
call @CallReadBlock
jmp @ProcessControlCodeChar
@CallReadBlock:
pushf
push AX
push DX
push DI
end; { asm }
ReadBlock;
asm
pop DI
pop DX
pop AX
popf
mov BX, offset Table
les SI, [ Block ]
mov CX, [ Actual ]
jcxz @EndOfFile
retn
@EndOfFile:
add SP, 2
cmp AL, 0
jnz @Fini
add DI, 1
adc DX, 0
@Fini:
mov [ word ptr TempCount ], DI
mov [ word ptr TempCount + 2 ], DX
end; { asm }
FiniCount;
WordCount := TempCount;
end;
{begin}
end.